Van den Berge et al. recently showed that observation weights can facilitate application of standard RNA-seq differential expression tools such as DESeq and edgeR for single cell RNA-seq experiments. Our objective here is to show that cell weighting by the estimated dropout proportion can also facilitate the application of differential expression tools to scRNA-seq experiments. I’ll first start off with a simulation.
I will assume the following model. \[
\begin{aligned}
y_{ij} &= \text{count of gene } j \text{ in cell } i;
\notag \\
y_{ij} &\sim \pi_{ij} 1(0) + (1 - \pi_{ij}) \text{Pois}(\lambda_{ij});
\notag \\
\lambda_{ij} &\sim d*\phi_{ij}*\lambda_{0j}*\log \text{Normal} ( -0.5, 1^{2});
\notag \\
\phi_{ij} &= 5 \text{ for } 10\% \text{ of the genes in group 1 and } = 0.2 \text{ for } 10\% \text{ of the genes in group 1, and 0 otherwise};
\notag \\
\lambda_{0j} &\sim \text{Gamma}(\text{shape } = 0.1, \text{ rate } = 0.1);
\notag \\
\theta_{i} &\sim \log \text{Normal}(-2.5^{2}/2, 2.5^2 ).
\notag \\
d &= \text{ depth factor};
\notag \\
\pi_{ij} &= 1/(1 + \text{exp}(\log(1/\pi_{0j} - 1) + 0.5*(\phi_{i,j} - \bar{\phi})))
\notag \\
\pi_{0j} &\sim \text{Beta}(2, 4)
\end{aligned}
\]
set.seed(1)
n_cells = 1000
cellLabels = factor(c(paste0("treatment", 1:(n_cells/2)), paste0("control", (n_cells/2 + 1):n_cells)))
n_genes = 20000
theta = exp(rnorm(n_cells, -(2.5^2)/2, 2.5))
hist(theta, breaks = 50, col = "grey")

lambda0 = rgamma(n_genes, shape = 0.1, rate = 0.1)
hist(lambda0, breaks = 100, col = "grey")

phi = matrix(1, nrow = n_genes, ncol = n_cells)
dropout_baseline = rbeta(n_cells, 2, 4)
hist(dropout_baseline, breaks = 100)

dropout_baseline = -log(1/dropout_baseline - 1) # inverse of logistic function
depth_factor = 0.05
for(i in 1:(0.5*n_cells)){
for(j in 1:n_genes){
if(j <= 0.1*n_genes){
phi[j, i] = 5
}
if(j <= 0.2*n_genes & j > 0.1*n_genes){
phi[j, i] = 0.2
}
}
}
geneConditions = c(rep(1, times = 0.1*n_genes),
rep(-1, times = 0.1*n_genes),
rep(0, times = 0.8*n_genes))
phibar = mean(phi)
dropout_indicator = mat.or.vec(n_genes, n_cells)
dropout_chance = mat.or.vec(n_genes, n_cells)
for(i in 1:n_genes){
for(j in 1:n_cells){
dropout_chance[i,j] = 1/(1 + exp(-dropout_baseline[j] + (phi[i,j] - phibar)))
dropout_indicator[i,j] = rbinom(1, 1, dropout_chance[i,j])
}
}
hist(dropout_chance, breaks = 101)

counts = mat.or.vec(n_genes, n_cells)
colnames(counts) = cellLabels
rownames(counts) = paste0("gene", 1:n_genes)
for(i in 1:n_cells){
for(j in 1:n_genes){
if(dropout_indicator[j, i] == 0){
#counts[j, i] = rnbinom(1, mu = depth_factor*phi[j, i]*lambda0[j]*theta[i], size = 0.1)
counts[j, i] = rpois(1, depth_factor*phi[j, i]*lambda0[j]*theta[i]*exp(rnorm(1, -0.5, sd = 1)))
}
}
}
hist(apply(counts, 2, sum), breaks = 100)

hist(log(apply(counts, 2, sum)), breaks = 100)

mean(apply(counts, 2, sum))
## [1] 1307.174
max(counts)
## [1] 16556
#dropout_chance.vs.logcounts = data.frame(dropout_chance = c(dropout_chance), logcounts = log(c(counts + 1)))
#library(ggplot2)
#ggplot(dropout_chance.vs.logcounts[sample.int(dim(dropout_chance.vs.logcounts)[1], dim(dropout_chance.vs.logcounts)[1]/10, replace = FALSE), ], aes(x = dropout_chance, y = logcounts)) + geom_point(alpha = 0.5) + stat_smooth()
hist(apply(counts, 2, function(x) sum(x == 0)), breaks = 100)

cells2keep = which(colSums(counts) > 50)
n_cells = length(cells2keep)
n_cells
## [1] 436
genes2keep = intersect(which(rowSums(counts) > 10), which(rowSums((counts > 0)) > 3))
geneConditions = geneConditions[genes2keep]
n_genes = length(genes2keep)
n_genes
## [1] 5262
counts = counts[genes2keep, cells2keep]
dim(counts)
## [1] 5262 436
cellLabels = cellLabels[cells2keep]
#counts = counts[ , cells2keep]
dropout_indicator = dropout_indicator[genes2keep, cells2keep]
#dropout_indicator = dropout_indicator[ , cells2keep]
phi = phi[genes2keep, cells2keep]
true_dropout = colSums(phi == 0 | dropout_indicator == 1)/n_genes
hist(true_dropout, breaks = 100)
observed_dropout = colSums((counts == 0))/n_genes
library(ggplot2)

d = data.frame(true_dropout = true_dropout, observed_dropout = observed_dropout)
ggplot(d, aes(y = true_dropout, x = observed_dropout)) + geom_point(alpha = 0.8)

cor(true_dropout, observed_dropout)
preseq_estimated_dropout = rep(0, times = n_cells)
chao_estimated_dropout = rep(0, times = n_cells)
chaobunge_estimated_dropout = rep(0, times = n_cells)
chaolee_estimated_dropout = rep(0, times = n_cells)
#pnpmle_estimated_dropout = rep(0, times = n_cells)
breakaway_estimated_dropout = rep(0, times = n_cells)
jackknife_estimated_dropout = rep(0, times = n_cells)
negbinom_estimated_dropout = rep(0, times = n_cells)
depth = rep(0, times = n_cells)
library(SPECIES)
library(breakaway)
library(preseqR)
neg_binom_species <- function(y_hist){
ztnb_fit = preseqR.ztnb.em(y_hist)
return(sum(y_hist[,2])*(1 + 1/(1 - (1 + ztnb_fit$mu/ztnb_fit$size)^(-ztnb_fit$size))))
}
for(i in 1:n_cells){
y = counts[,i]
y = y[(y > 0)]
y_unique = sort(unique(y), decreasing = FALSE)
y_hist = data.frame(j = y_unique, n_j = sapply(y_unique, function(x) length(which(y == x))))
write.table(y_hist, file = "y_hist.txt", sep = "\t", quote = FALSE, col.names = FALSE, row.names = FALSE)
if(dim(y_hist)[1] > 1){
system("~/preseq/preseq bound_pop -H y_hist.txt -o preseq_estim.txt")
preseq_estim = read.table(file = "preseq_estim.txt", header = TRUE)
preseq_estimated_dropout[i] = 1 - preseq_estim$median_estimated_unobs/n_genes
chao_estimated_dropout[i] = 1 - chao1984(y_hist)$Nhat/n_genes
chaobunge_estimated_dropout[i] = 1 - ChaoBunge(y_hist)$Nhat/n_genes
chaolee_estimated_dropout[i] = 1 - ChaoLee1992(y_hist, method = "ACE")$Nhat/n_genes
#pnpmle_estimated_dropout[i] = 1 - pnpmle(y_hist, C=0, dis = 0)$Nhat/n_genes
negbinom_estimated_dropout[i] = 1 - neg_binom_species(y_hist)/n_genes
if(dim(y_hist)[1] == 6 & y_hist[6, 1] == 6){
# need sufficient number of counts for breakaway
breakaway_estimated_dropout[i] = 1 - breakaway(y_hist, plot = FALSE, print = FALSE, answers = TRUE)$est/n_genes
}
jackknife_estimated_dropout[i] = 1 - jackknife(y_hist)$Nhat/n_genes
}
}
d = data.frame(d, preseq_estimated_dropout = preseq_estimated_dropout,
chao_estimated_dropout = chao_estimated_dropout,
chaolee_estimated_dropout = chaolee_estimated_dropout,
chaobunge_estimated_dropout = chaobunge_estimated_dropout,
#pnpmle_estimated_dropout = pnpmle_estimated_dropout,
breakaway_estimated_dropout = breakaway_estimated_dropout,
jackknife_estimated_dropout = jackknife_estimated_dropout,
negbinom_estimated_dropout = negbinom_estimated_dropout,
depth = colSums(counts))
d$preseq_estimated_dropout[which(d$preseq_estimated_dropout <= 0 | is.na(d$preseq_estimated_dropout))] = d$observed_dropout[which(d$preseq_estimated_dropout <= 0)]
#d$pnpmle_estimated_dropout[which(d$pnpmle_estimated_dropout <= 0)] = d$observed_dropout[which(d$pnpmle_estimated_dropout <= 0)]
d$chaobunge_estimated_dropout[which(d$chaobunge_estimated_dropout <= 0 | is.na(d$chaobunge_estimated_dropout))] = d$observed_dropout[which(d$chaobunge_estimated_dropout <= 0)]
## Warning in
## d$chaobunge_estimated_dropout[which(d$chaobunge_estimated_dropout <= :
## number of items to replace is not a multiple of replacement length
d$chao_estimated_dropout[which(d$chao_estimated_dropout <= 0 | is.na(d$chao_estimated_dropout))] = d$observed_dropout[which(d$chao_estimated_dropout <= 0)]
d$chaolee_estimated_dropout[which(d$chaolee_estimated_dropout <= 0 | is.na(d$chaolee_estimated_dropout))] = d$observed_dropout[which(d$chaolee_estimated_dropout <= 0)]
d$breakaway_estimated_dropout[which(d$breakaway_estimated_dropout <= 0 | is.na(d$breakaway_estimated_dropout))] = d$observed_dropout[which(d$breakaway_estimated_dropout <= 0)]
d$jackknife_estimated_dropout[which(d$jackknife_estimated_dropout <= 0 | is.na(d$jackknife_estimated_dropout))] = d$observed_dropout[which(d$jackknife_estimated_dropout <= 0)]
d$negbinom_estimated_dropout[which(d$negbinom_estimated_dropout <= 0 | is.na(d$negbinom_estimated_dropout))] = d$observed_dropout[which(d$negbinom_estimated_dropout <= 0)]
d$chaobunge_estimated_dropout[which(d$chaobunge_estimated_dropout > 1)] = 1
d$chaolee_estimated_dropout[which(d$chaolee_estimated_dropout > 1)] = 1
y = data.frame(algorithm = c("observed", "ACE", "breakaway", "ChaoBunge", "Chao", "jackknife", "preseq", "negbinom"#, "pnpmle"
), correlations = c(cor(d$depth, d$observed_dropout),
cor(d$depth, d$chaolee_estimated_dropout),
cor(d$depth, d$breakaway_estimated_dropout),
cor(d$depth, d$chaobunge_estimated_dropout),
cor(d$depth, d$chao_estimated_dropout),
cor(d$depth, d$jackknife_estimated_dropout),
cor(d$depth, d$preseq_estimated_dropout),
cor(d$depth, d$negbinom_estimated_dropout)
#,cor(d$depth, d$pnpmle_estimated_dropout)
))
y$algorithm = factor(y$algorithm, levels = c("observed","ACE", "breakaway", "ChaoBunge", "Chao", "jackknife", "preseq", "negbinom"#, "pnpmle"
))
ggplot(y, aes(x = algorithm, y = correlations)) + geom_bar(stat="identity") + theme_bw() + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black"))

y
## algorithm correlations
## 1 observed -0.3813217
## 2 ACE -0.2596501
## 3 breakaway -0.3289994
## 4 ChaoBunge -0.1934357
## 5 Chao -0.2707138
## 6 jackknife -0.2058381
## 7 preseq -0.2112565
## 8 negbinom -0.3088861
c = data.frame(correlations = c(cor(d$true_dropout, d$observed_dropout),
cor(d$true_dropout, d$chaolee_estimated_dropout),
cor(d$true_dropout, d$breakaway_estimated_dropout),
cor(d$true_dropout, d$chaobunge_estimated_dropout),
cor(d$true_dropout, d$chao_estimated_dropout),
cor(d$true_dropout, d$jackknife_estimated_dropout),
cor(d$true_dropout, d$preseq_estimated_dropout),
cor(d$true_dropout, d$negbinom_estimated_dropout)
#, cor(d$true_dropout, d$pnpmle_estimated_dropout)
),
algorithm = c("observed", "ACE", "breakaway", "ChaoBunge", "Chao", "jackknife", "preseq", "negbinom"
#, "pnpmle"
))
c
## correlations algorithm
## 1 0.2270614 observed
## 2 0.4025388 ACE
## 3 0.2034117 breakaway
## 4 0.0746364 ChaoBunge
## 5 0.4031629 Chao
## 6 0.2925732 jackknife
## 7 0.3471022 preseq
## 8 0.1446541 negbinom
meanSquareError = data.frame(meanSquareError = c(mean((d$true_dropout - d$observed_dropout)^2),
mean((d$true_dropout - d$chaolee_estimated_dropout)^2),
mean((d$true_dropout - d$breakaway_estimated_dropout)^2),
mean((d$true_dropout - d$chaobunge_estimated_dropout)^2),
mean((d$true_dropout - d$chao_estimated_dropout)^2),
mean((d$true_dropout - d$jackknife_estimated_dropout)^2),
mean((d$true_dropout - d$preseq_estimated_dropout)^2),
mean((d$true_dropout - d$negbinom_estimated_dropout)^2)),
algorithm = c("observed", "ACE", "breakaway", "ChaoBunge", "Chao", "jackknife", "preseq", "negbinom"))
meanSquareError$algorithm = factor(meanSquareError$algorithm, levels = c("observed", "ACE", "breakaway", "ChaoBunge", "Chao", "jackknife", "negbinom", "preseq"))
meanSquareError
## meanSquareError algorithm
## 1 0.3711687 observed
## 2 0.2378723 ACE
## 3 0.3632310 breakaway
## 4 0.3700155 ChaoBunge
## 5 0.2413438 Chao
## 6 0.2559039 jackknife
## 7 0.2165532 preseq
## 8 0.3408163 negbinom
ggplot(meanSquareError, aes(y = meanSquareError, x = algorithm)) + geom_bar(stat="identity") + theme_bw() + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black"))

ggplot(c, aes(x = algorithm, y = correlations)) + geom_bar(stat="identity") + theme_bw() + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black"))

ggplot(d, aes(x = observed_dropout, y = true_dropout)) + geom_point(alpha = 0.8) + theme_bw() + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black")) + xlim(0, 1) + ylim(0, 1) + geom_abline(intercept = 0, slope = 1)

ggplot(d, aes(x = preseq_estimated_dropout, y = true_dropout)) + geom_point(alpha = 0.8) + theme_bw() + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black")) + xlim(0, 1) + ylim(0, 1) + geom_abline(intercept = 0, slope = 1)

ggplot(d, aes(x = negbinom_estimated_dropout, y = true_dropout)) + geom_point(alpha = 0.8) + theme_bw() + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black")) + xlim(0, 1) + ylim(0, 1) + geom_abline(intercept = 0, slope = 1)

#ggplot(d, aes(x = pnpmle_estimated_dropout, y = true_dropout)) + geom_point(alpha = 0.8) + theme_bw() + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black")) + xlim(0, 1) + ylim(0, 1) + geom_abline(intercept = 0, slope = 1)
ggplot(d, aes(x = chaolee_estimated_dropout, y = true_dropout)) + geom_point(alpha = 0.8) + theme_bw() + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black")) + xlim(0, 1) + ylim(0, 1) + geom_abline(intercept = 0, slope = 1)

ggplot(d, aes(x = jackknife_estimated_dropout, y = true_dropout)) + geom_point(alpha = 0.8) + theme_bw() + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black")) + xlim(0, 1) + ylim(0, 1) + geom_abline(intercept = 0, slope = 1)

ggplot(d, aes(x = chao_estimated_dropout, y = true_dropout)) + geom_point(alpha = 0.8) + theme_bw() + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black")) + xlim(0, 1) + ylim(0, 1) + geom_abline(intercept = 0, slope = 1)

ggplot(d, aes(x = breakaway_estimated_dropout, y = true_dropout)) + geom_point(alpha = 0.8) + theme_bw() + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black")) + xlim(0, 1) + ylim(0, 1) + geom_abline(intercept = 0, slope = 1)

ggplot(d, aes(x = chaobunge_estimated_dropout, y = true_dropout)) + geom_point(alpha = 0.8) + theme_bw() + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black")) + xlim(0, 1) + ylim(0, 1) + geom_abline(intercept = 0, slope = 1)

sampleWeights = (1 - preseq_estimated_dropout)
hist(sampleWeights, breaks = 50, col = "grey")

library(edgeR)
## Warning: package 'edgeR' was built under R version 3.5.2
## Loading required package: limma
condition = factor(as.numeric(startsWith(sapply(cellLabels, toString), "treat")))
design <- model.matrix(~condition)
naiveDGE = DGEList(counts)
naiveDGE = calcNormFactors(naiveDGE)
## Warning in max(abs(logR)): no non-missing arguments to max; returning -Inf
## Warning in max(abs(logR)): no non-missing arguments to max; returning -Inf
## Warning in max(abs(logR)): no non-missing arguments to max; returning -Inf
## Warning in max(abs(logR)): no non-missing arguments to max; returning -Inf
## Warning in max(abs(logR)): no non-missing arguments to max; returning -Inf
naiveV = cpm(naiveDGE, log=TRUE, prior.count=3)
naiveFit = lmFit(naiveV, design=design)
naiveEB = eBayes(naiveFit, trend=TRUE, robust=TRUE)
decideTests(naiveEB)
## TestResults matrix
## (Intercept) condition1
## gene4 1 0
## gene8 1 1
## gene10 1 1
## gene13 1 1
## gene15 1 0
## 5257 more rows ...
naiveEB.results = topTable(naiveEB, number = dim(counts)[1])
## Removing intercept from test coefficients
dim(naiveEB.results)
## [1] 5262 6
head(naiveEB.results)
## logFC AveExpr t P.Value adj.P.Val B
## gene489 0.7250079 12.72025 9.119931 1.635843e-18 8.607806e-15 31.33298
## gene479 0.7441094 12.74522 8.873296 1.142117e-17 2.174486e-14 29.44287
## gene168 0.8022791 12.76054 8.862781 1.239730e-17 2.174486e-14 29.36313
## gene282 0.6847587 12.62025 8.777003 2.414043e-17 3.175674e-14 28.71521
## gene1146 0.5904708 12.55575 8.726298 3.571678e-17 3.758834e-14 28.33440
## gene1726 0.5506475 12.53994 8.612241 8.569685e-17 7.515614e-14 27.48378
length(geneConditions)
## [1] 5262
sum(geneConditions > 0)
## [1] 732
sum(naiveEB.results$logFC > 0)
## [1] 1497
sum(naiveEB.results$adj.P.Val < 0.05)
## [1] 941
sum(geneConditions > 0 & naiveEB.results$logFC > 0 & naiveEB.results$adj.P.Val < 0.05)
## [1] 430
sum(geneConditions > 0 & naiveEB.results$logFC > 0 & naiveEB.results$adj.P.Val < 0.05)/sum(geneConditions > 0)
## [1] 0.5874317
sum(geneConditions < 0)
## [1] 435
sum(naiveEB.results$logFC < 0)
## [1] 3765
sum(naiveEB.results$adj.P.Val < 0.05)
## [1] 941
sum(geneConditions < 0 & naiveEB.results$logFC < 0 & naiveEB.results$adj.P.Val < 0.05)
## [1] 155
sum(geneConditions < 0 & naiveEB.results$logFC < 0 & naiveEB.results$adj.P.Val < 0.05)/sum(geneConditions > 0)
## [1] 0.2117486
# corrected by dropout rate
condition = factor(as.numeric(startsWith(sapply(cellLabels, toString), "treat")))
design <- model.matrix(~condition)
correctedDGE = DGEList(counts)
correctedDGE = calcNormFactors(correctedDGE)
## Warning in max(abs(logR)): no non-missing arguments to max; returning -Inf
## Warning in max(abs(logR)): no non-missing arguments to max; returning -Inf
## Warning in max(abs(logR)): no non-missing arguments to max; returning -Inf
## Warning in max(abs(logR)): no non-missing arguments to max; returning -Inf
## Warning in max(abs(logR)): no non-missing arguments to max; returning -Inf
correctedV = cpm(correctedDGE, log=TRUE, prior.count=3)
correctedFit = lmFit(correctedV, design=design, weights = 1 - preseq_estimated_dropout)
correctedEB = eBayes(correctedFit, trend=TRUE, robust=TRUE)
decideTests(correctedEB)
## TestResults matrix
## (Intercept) condition1
## gene4 1 0
## gene8 1 1
## gene10 1 1
## gene13 1 1
## gene15 1 0
## 5257 more rows ...
correctedEB.results = topTable(correctedEB, number = dim(counts)[1])
## Removing intercept from test coefficients
dim(correctedEB.results)
## [1] 5262 6
head(correctedEB.results)
## logFC AveExpr t P.Value adj.P.Val B
## gene1146 0.8294813 12.55575 11.19792 5.125029e-26 2.696790e-22 48.26536
## gene168 1.1149151 12.76054 11.04620 1.981103e-25 5.212281e-22 46.94055
## gene489 0.9466060 12.72025 10.87410 9.066274e-25 1.590225e-21 45.45057
## gene282 0.9365995 12.62025 10.81169 1.568305e-24 2.063105e-21 44.91375
## gene1430 0.9072340 12.64168 10.32991 1.011474e-22 9.182279e-20 40.83350
## gene479 0.9681175 12.74522 10.32586 1.047010e-22 9.182279e-20 40.79970
length(geneConditions)
## [1] 5262
sum(geneConditions > 0)
## [1] 732
sum(correctedEB.results$logFC > 0)
## [1] 1802
sum(correctedEB.results$adj.P.Val < 0.05)
## [1] 1399
sum(geneConditions > 0 & correctedEB.results$logFC > 0 & correctedEB.results$adj.P.Val < 0.05)
## [1] 462
sum(geneConditions > 0 & correctedEB.results$logFC > 0 & correctedEB.results$adj.P.Val < 0.05)/sum(geneConditions > 0)
## [1] 0.6311475
sum(geneConditions < 0)
## [1] 435
sum(correctedEB.results$logFC < 0)
## [1] 3460
sum(correctedEB.results$adj.P.Val < 0.05)
## [1] 1399
sum(geneConditions < 0 & correctedEB.results$logFC < 0 & correctedEB.results$adj.P.Val < 0.05)
## [1] 309
sum(geneConditions < 0 & correctedEB.results$logFC < 0 & correctedEB.results$adj.P.Val < 0.05)/sum(geneConditions > 0)
## [1] 0.4221311
# corrected by dropout rate
condition = factor(as.numeric(startsWith(sapply(cellLabels, toString), "treat")))
design <- model.matrix(~condition)
AWcorrectedDGE = DGEList(counts)
AWcorrectedDGE = calcNormFactors(AWcorrectedDGE)
## Warning in max(abs(logR)): no non-missing arguments to max; returning -Inf
## Warning in max(abs(logR)): no non-missing arguments to max; returning -Inf
## Warning in max(abs(logR)): no non-missing arguments to max; returning -Inf
## Warning in max(abs(logR)): no non-missing arguments to max; returning -Inf
## Warning in max(abs(logR)): no non-missing arguments to max; returning -Inf
AWcorrectedV = cpm(AWcorrectedDGE, log=TRUE, prior.count=3)
aw = arrayWeights(AWcorrectedV, design)
AWcorrectedFit = lmFit(AWcorrectedV, design=design, weights = aw)
AWcorrectedEB = eBayes(AWcorrectedFit, trend=TRUE, robust=TRUE)
decideTests(AWcorrectedEB)
## TestResults matrix
## (Intercept) condition1
## gene4 1 0
## gene8 1 0
## gene10 1 1
## gene13 1 0
## gene15 1 0
## 5257 more rows ...
AWcorrectedEB.results = topTable(AWcorrectedEB, number = dim(counts)[1])
## Removing intercept from test coefficients
dim(AWcorrectedEB.results)
## [1] 5262 6
head(AWcorrectedEB.results)
## logFC AveExpr t P.Value adj.P.Val B
## gene1726 0.3846869 12.53994 9.520331 1.153496e-19 6.069696e-16 33.84367
## gene479 0.4269378 12.74522 8.465514 3.827358e-16 1.006978e-12 25.86449
## gene489 0.3812028 12.72025 8.024063 9.367140e-15 1.642996e-11 22.72370
## gene1350 0.3212715 12.55499 7.830323 3.666658e-14 4.823489e-11 21.38470
## gene123 0.4079216 12.70924 7.745864 6.597067e-14 6.942753e-11 20.80868
## gene633 0.4810947 12.84121 7.675749 1.070462e-13 9.387953e-11 20.33411
length(geneConditions)
## [1] 5262
sum(geneConditions > 0)
## [1] 732
sum(AWcorrectedEB.results$logFC > 0)
## [1] 2315
sum(AWcorrectedEB.results$adj.P.Val < 0.05)
## [1] 372
sum(geneConditions > 0 & AWcorrectedEB.results$logFC > 0 & AWcorrectedEB.results$adj.P.Val < 0.05)
## [1] 275
sum(geneConditions > 0 & AWcorrectedEB.results$logFC > 0 & AWcorrectedEB.results$adj.P.Val < 0.05)/sum(geneConditions > 0)
## [1] 0.3756831
sum(geneConditions < 0)
## [1] 435
sum(AWcorrectedEB.results$logFC < 0)
## [1] 2947
sum(AWcorrectedEB.results$adj.P.Val < 0.05)
## [1] 372
sum(geneConditions < 0 & AWcorrectedEB.results$logFC < 0 & AWcorrectedEB.results$adj.P.Val < 0.05)
## [1] 0
sum(geneConditions < 0 & AWcorrectedEB.results$logFC < 0 & AWcorrectedEB.results$adj.P.Val < 0.05)/sum(geneConditions > 0)
## [1] 0
library(pander)
x = data.frame(p0.01 = c(sum(geneConditions != 0),
sum(sign(geneConditions) == sign(naiveEB.results$logFC) & naiveEB.results$adj.P.Val < 0.01),
sum(sign(geneConditions) == sign(correctedEB.results$logFC) & correctedEB.results$adj.P.Val < 0.01),
sum(sign(geneConditions) == sign(AWcorrectedEB.results$logFC) & AWcorrectedEB.results$adj.P.Val < 0.01)),
p0.05 = c(sum(geneConditions != 0), sum(sign(geneConditions) == sign(naiveEB.results$logFC) & naiveEB.results$adj.P.Val < 0.05),
sum(sign(geneConditions) == sign(correctedEB.results$logFC) & correctedEB.results$adj.P.Val < 0.05),
sum(sign(geneConditions) == sign(AWcorrectedEB.results$logFC) & AWcorrectedEB.results$adj.P.Val < 0.05)),
p0.1 = c(sum(geneConditions != 0),
sum(sign(geneConditions) == sign(naiveEB.results$logFC) & naiveEB.results$adj.P.Val < 0.1),
sum(sign(geneConditions) == sign(correctedEB.results$logFC) & correctedEB.results$adj.P.Val < 0.1),
sum(sign(geneConditions) == sign(AWcorrectedEB.results$logFC) & AWcorrectedEB.results$adj.P.Val < 0.1)))
rownames(x) = c("total", "naive", "preseq corrected", "AW corrected")
pander::pander(x)
| total |
1167 |
1167 |
1167 |
| naive |
402 |
585 |
768 |
| preseq corrected |
643 |
771 |
771 |
| AW corrected |
221 |
275 |
314 |
x = data.frame(p0.01 = c(sum(sign(geneConditions) != sign(naiveEB.results$logFC) & naiveEB.results$adj.P.Val < 0.01),
sum(sign(geneConditions) != sign(correctedEB.results$logFC) & correctedEB.results$adj.P.Val < 0.01),
sum(sign(geneConditions) != sign(AWcorrectedEB.results$logFC) & AWcorrectedEB.results$adj.P.Val < 0.01)),
p0.05 = c(sum(sign(geneConditions) != sign(naiveEB.results$logFC) & naiveEB.results$adj.P.Val < 0.05),
sum(sign(geneConditions) != sign(correctedEB.results$logFC) & correctedEB.results$adj.P.Val < 0.05),
sum(sign(geneConditions) != sign(AWcorrectedEB.results$logFC) & AWcorrectedEB.results$adj.P.Val < 0.05)),
p0.1 = c(sum(sign(geneConditions) != sign(naiveEB.results$logFC) & naiveEB.results$adj.P.Val < 0.1),
sum(sign(geneConditions) != sign(correctedEB.results$logFC) & correctedEB.results$adj.P.Val < 0.1),
sum(sign(geneConditions) != sign(AWcorrectedEB.results$logFC) & AWcorrectedEB.results$adj.P.Val < 0.1)))
rownames(x) = c("naive", "preseq corrected", "AW corrected")
pander::pander(x)
| naive |
264 |
356 |
422 |
| preseq corrected |
360 |
628 |
948 |
| AW corrected |
42 |
97 |
143 |
ggplot(data.frame(arrayWeights = aw, preseqWeights = 1 - preseq_estimated_dropout), aes(x = arrayWeights, y = preseqWeights)) + geom_point() + theme_bw() + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black"))

cor(aw, 1 - preseq_estimated_dropout)
## [1] -0.1920448
# observation weights with edgeR
# from https://github.com/statOmics/zinbwaveZinger/blob/master/timebenchmark/islam/benchmark_islam.Rmd
library(zinbwave)
## Warning: package 'zinbwave' was built under R version 3.5.2
## Loading required package: SummarizedExperiment
## Loading required package: GenomicRanges
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following object is masked from 'package:limma':
##
## plotMA
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind,
## colMeans, colnames, colSums, dirname, do.call, duplicated,
## eval, evalq, Filter, Find, get, grep, grepl, intersect,
## is.unsorted, lapply, lengths, Map, mapply, match, mget, order,
## paste, pmax, pmax.int, pmin, pmin.int, Position, rank, rbind,
## Reduce, rowMeans, rownames, rowSums, sapply, setdiff, sort,
## table, tapply, union, unique, unsplit, which, which.max,
## which.min
## Loading required package: S4Vectors
##
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:base':
##
## expand.grid
## Loading required package: IRanges
## Loading required package: GenomeInfoDb
## Warning: package 'GenomeInfoDb' was built under R version 3.5.2
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
## Loading required package: DelayedArray
## Loading required package: matrixStats
##
## Attaching package: 'matrixStats'
## The following objects are masked from 'package:Biobase':
##
## anyMissing, rowMedians
## Loading required package: BiocParallel
##
## Attaching package: 'DelayedArray'
## The following objects are masked from 'package:matrixStats':
##
## colMaxs, colMins, colRanges, rowMaxs, rowMins, rowRanges
## The following objects are masked from 'package:base':
##
## aperm, apply
## Loading required package: SingleCellExperiment
## Warning: package 'SingleCellExperiment' was built under R version 3.5.2
##
## Attaching package: 'SingleCellExperiment'
## The following object is masked from 'package:edgeR':
##
## cpm
library(edgeR)
# compute zinbwave weights
zinb = zinbwave::zinbFit(Y = counts, X = design, epsilon = 1e12)
weights = zinbwave::computeObservationalWeights(zinb, counts)
d = edgeR::DGEList(counts)
d = edgeR::calcNormFactors(d)
## Warning in max(abs(logR)): no non-missing arguments to max; returning -Inf
## Warning in max(abs(logR)): no non-missing arguments to max; returning -Inf
## Warning in max(abs(logR)): no non-missing arguments to max; returning -Inf
## Warning in max(abs(logR)): no non-missing arguments to max; returning -Inf
## Warning in max(abs(logR)): no non-missing arguments to max; returning -Inf
d$weights = weights
d = edgeR::estimateDisp(d, design)
fit = edgeR::glmFit(d,design)
lrt = zinbwave::glmWeightedF(fit,coef=2, independentFiltering = TRUE)
pvals = lrt$table$padjFilter
#pvals = p.adjust(pvals, method = "BH")
sum(geneConditions > 0)
## [1] 732
sum(lrt$table$logFC > 0)
## [1] 1077
sum(pvals < 0.05)
## [1] 827
sum(geneConditions > 0 & lrt$table$logFC > 0 & pvals < 0.05)
## [1] 76
sum(geneConditions > 0 & lrt$table$logFC > 0 & pvals < 0.05)/sum(geneConditions > 0)
## [1] 0.1038251
sum(geneConditions > 0)
## [1] 732
sum(lrt$table$logFC < 0)
## [1] 4185
sum(pvals < 0.05)
## [1] 827
sum(geneConditions > 0 & lrt$table$logFC < 0 & pvals < 0.05)
## [1] 1
sum(geneConditions > 0 & lrt$table$logFC < 0 & pvals < 0.05)/sum(geneConditions > 0)
## [1] 0.00136612
# true positives
x = data.frame(p0.01 = c(sum(sign(geneConditions) == sign(naiveEB.results$logFC) & naiveEB.results$adj.P.Val < 0.01),
sum(sign(geneConditions) == sign(correctedEB.results$logFC) & correctedEB.results$adj.P.Val < 0.01),
sum(sign(geneConditions) == sign(AWcorrectedEB.results$logFC) & AWcorrectedEB.results$adj.P.Val < 0.01),
sum(sign(geneConditions) == sign(lrt$table$logFC) & pvals < 0.01)),
p0.05 = c(sum(sign(geneConditions) == sign(naiveEB.results$logFC) & naiveEB.results$adj.P.Val < 0.05),
sum(sign(geneConditions) == sign(correctedEB.results$logFC) & correctedEB.results$adj.P.Val < 0.05),
sum(sign(geneConditions) == sign(AWcorrectedEB.results$logFC) & AWcorrectedEB.results$adj.P.Val < 0.05),
sum(sign(geneConditions) == sign(lrt$table$logFC) & pvals < 0.05)),
p0.1 = c(sum(sign(geneConditions) == sign(naiveEB.results$logFC) & naiveEB.results$adj.P.Val < 0.1),
sum(sign(geneConditions) == sign(correctedEB.results$logFC) & correctedEB.results$adj.P.Val < 0.1),
sum(sign(geneConditions) == sign(AWcorrectedEB.results$logFC) & AWcorrectedEB.results$adj.P.Val < 0.1),
sum(sign(geneConditions) == sign(lrt$table$logFC) & pvals < 0.1)))
rownames(x) = c("naive", "preseq corrected", "AW corrected", "observation weights")
pander::pander(x)
| naive |
402 |
585 |
768 |
| preseq corrected |
643 |
771 |
771 |
| AW corrected |
221 |
275 |
314 |
| observation weights |
162 |
256 |
345 |
s = exp(seq(from = -8, to = -1, length = 101))
x = data.frame(algorithm = rep(c("naive", "preseq corrected", "AW corrected","observation weights"), each = length(s)), TruePositives = c(sapply(s, function(p) sum(sign(geneConditions) == sign(naiveEB.results$logFC) & naiveEB.results$adj.P.Val < p)), sapply(s, function(p) sum(sign(geneConditions) == sign(correctedEB.results$logFC) & correctedEB.results$adj.P.Val < p)), sapply(s, function(p) sum(sign(geneConditions) == sign(AWcorrectedEB.results$logFC) & AWcorrectedEB.results$adj.P.Val < p)), sapply(s, function(p) sum(sign(geneConditions) == sign(lrt$table$logFC) & pvals < p)) ), FDRthresh = rep(s, times = 4))
ggplot(x, aes(x = FDRthresh, y = TruePositives, col = algorithm)) + geom_line() + theme_bw() + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black")) + ggtitle("True positives vs adjust p-value threshold")

x = data.frame(algorithm = rep(c("naive", "preseq corrected", "AW corrected","observation weights"), each = length(s)), FalsePositives = c(sapply(s, function(p) sum(sign(geneConditions) != sign(naiveEB.results$logFC) & naiveEB.results$adj.P.Val < p)), sapply(s, function(p) sum(sign(geneConditions) != sign(correctedEB.results$logFC) & correctedEB.results$adj.P.Val < p)), sapply(s, function(p) sum(sign(geneConditions) != sign(AWcorrectedEB.results$logFC) & AWcorrectedEB.results$adj.P.Val < p)), sapply(s, function(p) sum(sign(geneConditions) != sign(lrt$table$logFC) & pvals < p)) ), FDRthresh = rep(s, times = 4))
ggplot(x, aes(x = FDRthresh, y = FalsePositives, col = algorithm)) + geom_line() + theme_bw() + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), axis.line = element_line(colour = "black")) + ggtitle("False positives vs adjust p-value threshold")

#False positives
x = data.frame(p0.01 = c(sum(sign(geneConditions) != sign(naiveEB.results$logFC) & naiveEB.results$adj.P.Val < 0.01),
sum(sign(geneConditions) != sign(correctedEB.results$logFC) & correctedEB.results$adj.P.Val < 0.01),
sum(sign(geneConditions) != sign(AWcorrectedEB.results$logFC) & AWcorrectedEB.results$adj.P.Val < 0.01),
sum(sign(geneConditions) != sign(lrt$table$logFC) & pvals < 0.01)),
p0.05 = c(sum(sign(geneConditions) != sign(naiveEB.results$logFC) & naiveEB.results$adj.P.Val < 0.05),
sum(sign(geneConditions) != sign(correctedEB.results$logFC) & correctedEB.results$adj.P.Val < 0.05),
sum(sign(geneConditions) != sign(AWcorrectedEB.results$logFC) & AWcorrectedEB.results$adj.P.Val < 0.05),
sum(sign(geneConditions) != sign(lrt$table$logFC) & pvals < 0.05)),
p0.1 = c(sum(sign(geneConditions) != sign(naiveEB.results$logFC) & naiveEB.results$adj.P.Val < 0.1),
sum(sign(geneConditions) != sign(correctedEB.results$logFC) & correctedEB.results$adj.P.Val < 0.1),
sum(sign(geneConditions) != sign(AWcorrectedEB.results$logFC) & AWcorrectedEB.results$adj.P.Val < 0.1),
sum(sign(geneConditions) == sign(lrt$table$logFC) & pvals < 0.1)))
rownames(x) = c("naive", "preseq corrected", "AW corrected", "observation weights")
pander::pander(x)
| naive |
264 |
356 |
422 |
| preseq corrected |
360 |
628 |
948 |
| AW corrected |
42 |
97 |
143 |
| observation weights |
246 |
571 |
345 |